import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from chart_studio.plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
# ---- Load the preprocessed DonorsChoose data and build the combined text column ----
data = pd.read_csv('preprocessed_data.csv', nrows=50000)
data.head(2)
data.columns
data['project_is_approved'].value_counts()

# Target / feature split.
y = data['project_is_approved'].values
X = data.drop(['project_is_approved'], axis=1)
X.head(2)
X['preprocessed_essays'][0]
X['preprocessed_titles'][0]
# Single text field (essay + title) used for the custom word-vector features.
X['combined'] = X['preprocessed_essays'] + ' ' + X['preprocessed_titles']
X['combined'][0]
X.head(2)
# train test split
from sklearn.model_selection import train_test_split
# random_state added so the split (and everything downstream) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
print("Shape of train & test data:")
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF on the combined essay+title text: uni+bi-grams, vocabulary capped at 5000.
vectorizer1 = TfidfVectorizer(min_df=10, ngram_range=(1, 2), max_features=5000)
vectorizer1.fit(X_train['combined'].values.astype('U'))  # fit on train only
X_train_comb_tfidf = vectorizer1.transform(X_train['combined'].values.astype('U'))
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f1 = list(vectorizer1.get_feature_names_out())
except AttributeError:
    f1 = list(vectorizer1.get_feature_names())
print("After vectorization")
print(X_train_comb_tfidf.shape, y_train.shape)
print("="*100)
print(len(vectorizer1.idf_))
print(len(f1))
# Rank the vocabulary by IDF, descending: the highest-IDF terms are the rarest.
idf = list(vectorizer1.idf_)
indices = np.argsort(idf)[::-1]  # https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
names = f1  # idf_ and the feature names share the same ordering; no need to re-query
top2k_idf = [idf[i] for i in indices[0:2000]]
top2k_words = [names[i] for i in indices[0:2000]]
print(len(top2k_idf))
print(len(top2k_words))
words_idf = list(zip(top2k_words, top2k_idf))
type(words_idf)
# top 20 words
print(words_idf[:20])
#ref:https://datascience.stackexchange.com/questions/40038/how-to-implement-word-to-word-co-occurence-matrix-in-python
# Co-occurrence counts of the top-2k IDF words within a sliding window over the
# combined train text.
unique = top2k_words
# O(1) dict lookups instead of list.index (O(n) per hit, inside a triple loop).
word_to_idx = {w: k for k, w in enumerate(unique)}
coo_mat = np.zeros((len(unique), len(unique)))
context = []
window_size = 5
# NOTE(review): `context` is never cleared between documents, so the window
# bleeds across essay boundaries — kept as-is to preserve the original counts.
for text in X_train['combined']:
    words = str(text).split(' ')
    for i, _ in enumerate(words):
        context.append(words[i])
        if len(context) > (window_size * 2) + 1:
            context.pop(0)
        cur = word_to_idx.get(words[i])
        if cur is not None:  # hoisted: the current word's membership is loop-invariant
            for ctx_word in context:
                ctx = word_to_idx.get(ctx_word)
                if ctx is not None:
                    coo_mat[ctx, cur] += 1
np.fill_diagonal(coo_mat, 0)  # a word co-occurring with itself carries no signal
coo_mat_df = pd.DataFrame(coo_mat)
coo_mat_df.index = unique
coo_mat_df.columns = unique
coo_mat_df.head(5)
print(coo_mat.shape)
type(coo_mat)
# Checkpoint the (expensive) matrix to disk.
np.save('coo_mat', coo_mat)
coo_mat = np.load('coo_mat.npy')
#elbow method
# initializing the SVD
from sklearn.decomposition import TruncatedSVD

# Near-full-rank SVD so the whole variance spectrum can be inspected.
spectrum_svd = TruncatedSVD(n_components=1999)
spectrum_svd.fit(coo_mat)
# Per-component share of explained variance, accumulated for the elbow plot.
var_share = spectrum_svd.explained_variance_ / np.sum(spectrum_svd.explained_variance_)
cum_var = np.cumsum(var_share)

# Plot the SVD spectrum
plt.figure(1, figsize=(6, 4))
plt.clf()
plt.plot(cum_var, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.title("Elbow graph")
plt.show()

# Reduce to 400 dimensions (chosen from the elbow above).
from sklearn.decomposition import TruncatedSVD
reducer = TruncatedSVD(n_components=400)
reduced = reducer.fit_transform(coo_mat)
print("After reduction")
print(reduced.shape)
np.save('truncated_coo', reduced)
#truncated_coo = np.load('truncated_coo.npy')
# storing the truncated co-occurence matrix in a dataframe
#index = coo_mat.index
tr_df = pd.DataFrame(reduced, index=top2k_words)
tr_df.head()
# Map each top-2k word to its 400-d truncated-SVD row vector.
word_vec = {word: vec for word, vec in zip(tr_df.index, tr_df.values)}
features = word_vec.keys()
def avgw2v(data, features, word_vectors=None, dim=400):
    """Average word-vector representation for each document.

    Parameters
    ----------
    data : iterable of documents (anything str() can render, e.g. a Series).
    features : container supporting `in` — the vocabulary to keep.
    word_vectors : mapping word -> vector. Defaults to the module-level
        `word_vec` dict built from the truncated co-occurrence matrix, so the
        original call sites keep working unchanged.
    dim : length of each word vector (default 400, as produced by TruncatedSVD).

    Returns
    -------
    list of numpy arrays: one `dim`-length average vector per document
    (all zeros when no word of the document is in `features`).
    """
    if word_vectors is None:
        word_vectors = word_vec  # module-level dict from the SVD step
    # Show a progress bar only when tqdm is importable in this module.
    iterator = tqdm(data) if 'tqdm' in globals() else data
    V = []  # average word2vec for each essay
    for text in iterator:
        svec = np.zeros(dim)
        count = 0
        for word in str(text).split():
            if word in features:
                svec += word_vectors[word]  # accumulate vectors of known words
                count += 1
        if count != 0:
            svec /= count  # average; guard keeps all-unknown docs at zeros
        V.append(svec)
    return V
# Average custom-W2V features for train and test text.
X_tr_avg = np.asarray(avgw2v(X_train['combined'], features))
print(X_tr_avg.shape)
X_test_avg = np.asarray(avgw2v(X_test['combined'], features))
print(X_test_avg.shape)
# One-hot encode school_state.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['school_state'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_state = vectorizer.transform(X_train['school_state'].values)
X_test_state = vectorizer.transform(X_test['school_state'].values)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f5 = list(vectorizer.get_feature_names_out())
except AttributeError:
    f5 = list(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_state.shape, y_train.shape)
print(X_test_state.shape, y_test.shape)
print(f5)
print("="*100)
# One-hot encode teacher_prefix (Mr./Mrs./Ms./Teacher/Dr.).
X_train['teacher_prefix'].unique()
vectorizer = CountVectorizer()
vectorizer.fit(X_train['teacher_prefix'].values)  # fit on train only
X_train_teacher = vectorizer.transform(X_train['teacher_prefix'].values)
X_test_teacher = vectorizer.transform(X_test['teacher_prefix'].values)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f6 = list(vectorizer.get_feature_names_out())
except AttributeError:
    f6 = list(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_teacher.shape, y_train.shape)
print(X_test_teacher.shape, y_test.shape)
print(f6)
print("="*100)
#This step is to intialize a vectorizer with vocab from train data
#Ref: https://www.kaggle.com/shashank49/donors-choose-knn#Concatinating-all-features-(TFIDF)
from collections import Counter
my_counter = Counter()
# Chunking by 14 characters keeps each grade-category value intact, assuming
# every value ("Grades PreK-2", "Grades 3-5", ...) is at most 14 chars.
# NOTE(review): confirm that assumption holds for the full dataset.
for word in X_train['project_grade_category'].values:
    my_counter.update([word[i:i+14] for i in range(0, len(word), 14)])  # https://www.geeksforgeeks.org/python-string-split/
# dict sort by value python: https://stackoverflow.com/a/613218/4084039
project_grade_category_dict = dict(my_counter)
sorted_project_grade_category_dict = dict(sorted(project_grade_category_dict.items(), key=lambda kv: kv[1]))
# Binary one-hot with a fixed, frequency-sorted vocabulary.
vectorizer = CountVectorizer(vocabulary=list(sorted_project_grade_category_dict.keys()), lowercase=False, binary=True, max_features=4)
vectorizer.fit(X_train['project_grade_category'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_grade = vectorizer.transform(X_train['project_grade_category'].values)
X_test_grade = vectorizer.transform(X_test['project_grade_category'].values)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f7 = list(vectorizer.get_feature_names_out())
except AttributeError:
    f7 = list(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_grade.shape, y_train.shape)
print(X_test_grade.shape, y_test.shape)
print(f7)
# One-hot encode the cleaned project categories.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['clean_categories'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_cat = vectorizer.transform(X_train['clean_categories'].values)
X_test_cat = vectorizer.transform(X_test['clean_categories'].values)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f8 = list(vectorizer.get_feature_names_out())
except AttributeError:
    f8 = list(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_cat.shape, y_train.shape)
print(X_test_cat.shape, y_test.shape)
print(f8)
print("="*100)
# One-hot encode the cleaned project sub-categories.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['clean_subcategories'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_subcat = vectorizer.transform(X_train['clean_subcategories'].values)
X_test_subcat = vectorizer.transform(X_test['clean_subcategories'].values)
# get_feature_names() was removed in scikit-learn 1.2; prefer the new API.
try:
    f9 = list(vectorizer.get_feature_names_out())
except AttributeError:
    f9 = list(vectorizer.get_feature_names())
print("After vectorizations")
print(X_train_subcat.shape, y_train.shape)
print(X_test_subcat.shape, y_test.shape)
print(f9)
print("="*100)
from sklearn.preprocessing import Normalizer

# NOTE(review): Normalizer scales each ROW to unit norm. Applied to a single
# column, every non-zero value becomes 1.0 (and 0 stays 0) — MinMaxScaler or
# StandardScaler was almost certainly intended. Kept as Normalizer so the
# pipeline's output is unchanged.
def _normalize_column(train_col, test_col):
    """Fit a Normalizer on the train column; return (train, test) transformed.

    Columns are reshaped to 2-D (n, 1) because sklearn transformers reject 1-D
    input ("Expected 2D array, got 1D array instead"). Normalizer.fit() is
    stateless, so fitting on train is a formality — this also fixes the
    original's inconsistent fit on reshape(1, -1) without changing any output.
    """
    nrm = Normalizer()
    train_2d = train_col.values.reshape(-1, 1)
    nrm.fit(train_2d)
    return nrm.transform(train_2d), nrm.transform(test_col.values.reshape(-1, 1))

X_train_price_norm, X_test_price_norm = _normalize_column(X_train['price'], X_test['price'])
print("After vectorizations")
print(X_train_price_norm.shape, y_train.shape)
print(X_test_price_norm.shape, y_test.shape)
print("="*100)

X_train_quantity_norm, X_test_quantity_norm = _normalize_column(X_train['quantity'], X_test['quantity'])
print("After vectorizations")
print(X_train_quantity_norm.shape, y_train.shape)
print(X_test_quantity_norm.shape, y_test.shape)
print("="*100)

X_train_projects_norm, X_test_projects_norm = _normalize_column(
    X_train['teacher_number_of_previously_posted_projects'],
    X_test['teacher_number_of_previously_posted_projects'])
print("After vectorizations")
print(X_train_projects_norm.shape, y_train.shape)
print(X_test_projects_norm.shape, y_test.shape)
print("="*100)

X_train_senti_norm, X_test_senti_norm = _normalize_column(X_train['sentimental_score'], X_test['sentimental_score'])
print("After vectorizations")
print(X_train_senti_norm.shape, y_train.shape)
print(X_test_senti_norm.shape, y_test.shape)
print("="*100)

X_train_ewc_norm, X_test_ewc_norm = _normalize_column(
    X_train['preprocessed_essay_word_count'], X_test['preprocessed_essay_word_count'])
print("After vectorization")
print(X_train_ewc_norm.shape, y_train.shape)
print(X_test_ewc_norm.shape, y_test.shape)
print("="*100)

X_train_twc_norm, X_test_twc_norm = _normalize_column(
    X_train['preprocessed_title_word_count'], X_test['preprocessed_title_word_count'])
print("After vectorization")
print(X_train_twc_norm.shape, y_train.shape)
print(X_test_twc_norm.shape, y_test.shape)
print("="*100)
from scipy.sparse import hstack

# Assemble the final design matrices: avg-W2V text features, the categorical
# one-hots, and the row-normalized numeric columns, as sparse CSR.
_train_parts = (X_tr_avg, X_train_state, X_train_teacher, X_train_grade,
                X_train_cat, X_train_subcat, X_train_price_norm,
                X_train_quantity_norm, X_train_projects_norm,
                X_train_senti_norm, X_train_ewc_norm, X_train_twc_norm)
_test_parts = (X_test_avg, X_test_state, X_test_teacher, X_test_grade,
               X_test_cat, X_test_subcat, X_test_price_norm,
               X_test_quantity_norm, X_test_projects_norm,
               X_test_senti_norm, X_test_ewc_norm, X_test_twc_norm)
X_tr_avgw2v = hstack(_train_parts).tocsr()
X_test_avgw2v = hstack(_test_parts).tocsr()
print("Final Data Matrix")
print(X_tr_avgw2v.shape, y_train.shape)
print(X_test_avgw2v.shape, y_test.shape)
# https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format
from scipy import sparse
sparse.save_npz("X_tr_avgw2v.npz", X_tr_avgw2v)
sparse.save_npz("X_test_avgw2v.npz", X_test_avgw2v)
#X_tr_avgw2v = sparse.load_npz("X_tr_avgw2v.npz")
#X_test_avgw2v = sparse.load_npz("X_test_avgw2v.npz")
#https://www.geeksforgeeks.org/numpy-save/
np.save('y_train', y_train)
np.save('y_test', y_test)
def batch_predict(clf, data):
    """Predict class-1 probabilities and hard labels in chunks of 1000 rows.

    roc_auc_score(y_true, y_score) expects probability estimates of the
    positive class, not predicted labels, so both are collected and returned
    as (probabilities, labels).
    """
    chunk = 1000
    probs = []
    labels = []
    # Rows covered by whole chunks, e.g. 49041 -> 49041 - 49041%1000 = 49000.
    full = data.shape[0] - data.shape[0] % chunk
    for start in range(0, full, chunk):
        batch = data[start:start + chunk]
        probs.extend(clf.predict_proba(batch)[:, 1])
        labels.extend(clf.predict(batch))
    # Tail that did not fill a whole chunk.
    if data.shape[0] % chunk != 0:
        tail = data[full:]
        probs.extend(clf.predict_proba(tail)[:, 1])
        labels.extend(clf.predict(tail))
    return probs, labels
## we will pick a threshold that will give the least fpr
def find_best_threshold(threshold, fpr, tpr):
    """Return the ROC threshold that maximises tpr*(1-fpr).

    The product is largest when tpr is high while fpr stays low — the usual
    operating point read off a ROC curve.
    """
    score = tpr * (1 - fpr)
    t = threshold[np.argmax(score)]
    print("The maximum value of tpr*(1-fpr)", max(score), "for threshold", np.round(t, 3))
    return t
def predict_with_best_t(proba, threshold):
    """Binarize probabilities: 1 where proba >= threshold, else 0."""
    return [1 if p >= threshold else 0 for p in proba]
#function to get heatmap of confusion matrix
# Reference: https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix
def cm_heatmap(cm):
    """Render a labelled seaborn heatmap for a 2x2 confusion matrix."""
    frame = pd.DataFrame(cm, range(2), range(2))
    frame.columns = ['Predicted NO', 'Predicted YES']
    frame = frame.rename({0: 'Actual NO', 1: 'Actual YES'})
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(frame, annot=True, annot_kws={"size": 16}, fmt='d')
#https://dask-ml.readthedocs.io/en/stable/modules/generated/dask_ml.xgboost.XGBClassifier.html
#https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Randomized search over tree count / depth, scored by 5-fold CV ROC-AUC.
xgb = XGBClassifier()
parameters = {'n_estimators': [4, 8, 16, 32, 64, 100], 'max_depth': [4, 6, 8, 10, 20, 25]}
# random_state makes the sampled parameter combinations reproducible.
model = RandomizedSearchCV(xgb, parameters, cv=5, scoring='roc_auc',
                           return_train_score=True, n_jobs=-1, random_state=42)
rs1 = model.fit(X_tr_avgw2v, y_train)
dfm = pd.DataFrame(model.cv_results_)
dfm.head(2)
# Round-trip through CSV so the search results survive notebook restarts.
dfm.to_csv('hyp.csv')
dfm = pd.read_csv('hyp.csv')
# '%matplotlib inline' is an IPython magic and a SyntaxError in plain Python;
# invoking it through get_ipython() keeps the file importable outside notebooks.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
def enable_plotly_in_cell():
    """Re-initialise offline plotly rendering inside a (Colab) notebook cell."""
    import IPython
    from plotly.offline import init_notebook_mode
    # Colab cells need require.js injected before plotly can draw offline.
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)
# https://plot.ly/python/3d-axes/
# 3-D scatter of the hyper-parameter search surface: train vs CV AUC.
trace1 = go.Scatter3d(x=dfm['param_n_estimators'], y=dfm['param_max_depth'], z=dfm['mean_train_score'], name='train')
trace2 = go.Scatter3d(x=dfm['param_n_estimators'], y=dfm['param_max_depth'], z=dfm['mean_test_score'], name='Cross validation')
# Renamed from `data`: the original rebinding clobbered the DataFrame loaded
# at the top of the script.
plot_traces = [trace1, trace2]
enable_plotly_in_cell()
layout = go.Layout(scene=dict(
    xaxis=dict(title='Estimators'),
    yaxis=dict(title='Max_depth'),
    zaxis=dict(title='AUC'),))
fig = go.Figure(data=plot_traces, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
print(model.best_estimator_)
# Dropped the accidental {…} set-literal braces around the printed scores.
print('Score on train data :', model.score(X_tr_avgw2v, y_train))
print('Mean cross-validated score of the best_estimator :', model.best_score_)
# Refit XGBoost with the best hyper-parameters found by the search.
xg_best = XGBClassifier(n_estimators=32, max_depth=4)
xg_best.fit(X_tr_avgw2v, y_train)
y_train_pred_avg_best, pred_labels_train = batch_predict(xg_best, X_tr_avgw2v)
y_test_pred_avg_best, pred_labels_test = batch_predict(xg_best, X_test_avgw2v)
# BUG FIX: sklearn's roc_curve returns (fpr, tpr, thresholds); the original
# unpacked them as (tpr, fpr, thresholds), so each variable held the opposite
# curve. The AUC and plot came out right only by a double swap, while
# find_best_threshold() downstream received swapped fpr/tpr.
train_fpr_avg, train_tpr_avg, tr_thresholds_avg = roc_curve(y_train, y_train_pred_avg_best)
test_fpr_avg, test_tpr_avg, te_thresholds_avg = roc_curve(y_test, y_test_pred_avg_best)
plt.plot(train_fpr_avg, train_tpr_avg, label="Train AUC ="+str(auc(train_fpr_avg, train_tpr_avg)))
plt.plot(test_fpr_avg, test_tpr_avg, label="Test AUC ="+str(auc(test_fpr_avg, test_tpr_avg)))
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.grid()
plt.show()
from sklearn.metrics import confusion_matrix
# Pick the operating threshold on the TRAIN ROC curve, then report confusion
# matrices for both splits at that single threshold.
best_t_avg = find_best_threshold(tr_thresholds_avg, train_fpr_avg, train_tpr_avg)
print("Train confusion matrix")
cm_train_avg=confusion_matrix(y_train, predict_with_best_t(y_train_pred_avg_best, best_t_avg))
print(cm_train_avg)
print("Test confusion matrix")
# NOTE(review): the test matrix reuses the train-derived threshold on purpose —
# thresholds must not be tuned on test data.
cm_test_avg=confusion_matrix(y_test, predict_with_best_t(y_test_pred_avg_best, best_t_avg))
print(cm_test_avg)
# confusion matrix heatmap for train data
print("Train confusion matrix heatmap")
cm_heatmap(cm_train_avg)
# confusion matrix heatmap for test data
print("Test confusion matrix heatmap")
cm_heatmap(cm_test_avg)
#Ref: http://zetcode.com/python/prettytable/
from prettytable import PrettyTable

# Final results summary for the custom-W2V + XGBoost experiment.
summary = PrettyTable()
summary.field_names = ["Vectorizer","Model","max_depth","n_estimators" ,"Train AUC","Test AUC"]
summary.add_row(["Custom W2V","XGBoost", 4, 32, 0.71,0.58])
print(summary)
# Adding on to the above observations: